Helper functions
bin_numeric <- function(x){
bins <- 3
pctiles <- 1:(bins-1) * 1 / (bins)
set.seed(42)
x <- x + runif(length(x)) * 1e-6
breaks <- quantile(x, c(0, pctiles, 1))
x_binned <- cut(x, breaks, include.lowest = TRUE)
levels(x_binned) <- c('low', 'med', 'high')
if(anyNA(x)){
x_binned <- addNA(x_binned, ifany = TRUE)
levels(x_binned) <- c('low', 'med', 'high', 'UNK')
}
as.character(x_binned)
}
cluster_plot <- function(x, y, c){
library(ggplot2)
plotdat <- data.table(x, y, cluster=c)
plotdat[,cluster := factor(cluster)]
colors <- c(
"#1f78b4", "#ff7f00", "#6a3d9a", "#33a02c", "#e31a1c", "#b15928",
"#a6cee3", "#fdbf6f", "#cab2d6", "#b2df8a", "#fb9a99", "#ffff99", "black",
"grey"
)
colors <- rep(colors, 100)
ggplot(plotdat, aes(x=x, y=y, color=cluster)) +
geom_point() + theme_bw() +
scale_color_manual(values=colors)
}
fast_mode <- function(x){
library(data.table)
out <- data.table(x, key='x')
out <- out[,list(.N), by='x']
setorder(out, -N, x)
return(out[1,x])
}
cluster_summary <- function(dt, c, bin=TRUE){
library(data.table)
library(stringi)
dat <- copy(dt)
is_num <- sapply(dat, is.numeric)
nums <- names(dat)[is_num]
chars <- names(dat)[!is_num]
if(bin){
for(var in nums){
set(dat, j=var, value=bin_numeric(dat[[var]]))
}
chars <- names(dat)
}
dat <- data.table(dat, cluster=c)
num_dat <- dat[,c(nums, 'cluster'), with=FALSE]
char_dat <- dat[,c(chars, 'cluster'), with=FALSE]
setkeyv(num_dat, 'cluster')
setkeyv(char_dat, 'cluster')
means <- num_dat[,lapply(.SD, function(x) round(mean(x), 1)), by='cluster']
modes <- char_dat[,lapply(.SD, fast_mode), by='cluster']
stopifnot(means[['cluster']] == modes[['cluster']])
cluster <- means[['cluster']]
means <- means[,-1,with=F]
modes <- modes[,-1,with=F]
means <- means[,order(sapply(means, sd), decreasing=TRUE),with=FALSE]
modes <- modes[,order(sapply(modes, function(x) length(unique(x))), decreasing=TRUE),with=FALSE]
mean_of_means <- sapply(means, mean)
sd_of_means <- sapply(means, sd)
mode_of_modes <- sapply(modes, fast_mode)
mean_labels <- apply(means, 1, function(x){
labels <- paste(names(x), x, sep='=')
above <- x > mean_of_means + sd_of_means
below <- x < mean_of_means - sd_of_means
either <- above | below
diff <- abs(x-mean_of_means)
order <- order(diff, decreasing=TRUE)
labels <- labels[order]
either <- either[order]
out <- paste(labels[either], collapse='; ')
return(out)
})
mode_labels <- apply(modes, 1, function(x){
labels <- paste(names(x), x, sep='=')
keep <- x != mode_of_modes
out <- paste(labels[keep], collapse='; ')
return(out)
})
if(bin){
labels <- mode_labels
} else{
labels <- paste(mean_labels, mode_labels)
}
labels <- stri_trim_both(labels)
out <- data.table(
cluster = factor(cluster, labels=labels),
means,
modes
)
keep <- sapply(out, function(x) length(unique(x))) > 1
return(out[,keep,with=FALSE])
}
cluster_outliers <- function(dat, c){
}
tune_with_kmeans <- function(d, c){
centers <- data.table(d, cluster=c, key='cluster')
centers <- centers[,lapply(.SD, mean), by='cluster']
centers[,cluster := NULL]
kmeans(d, as.matrix(centers))
}
Load the data
library(data.table)
db <- fread('https://s3.amazonaws.com/datarobot_data_science/test_data/10k_diabetes_train80.csv')
keep <- sapply(db, function(x) length(unique(x))) > 1
db <- db[,keep,with=FALSE]
text <- c('diag_1_desc', 'diag_2_desc', 'diag_3_desc')
db <- db[,setdiff(names(db), c(text, 'readmitted')), with=FALSE]
Pre-process non-text
library(Matrix)
db_matrix <- sparse.model.matrix(~ 0 + ., db)
db_matrix[1:10,1:7]
| 0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 0 |
1 |
0 |
0 |
0 |
0 |
1 |
| 0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 0 |
0 |
0 |
1 |
0 |
0 |
1 |
| 0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 0 |
0 |
0 |
1 |
0 |
0 |
0 |
SVD on the sparse matrix
library(RSpectra)
library(irlba)
svd_model <- irlba(db_matrix, nu=50, nv=0, verbose=TRUE, center=colMeans(db_matrix))
pca <- svd_model$u
colnames(pca) <- paste0('pc', 1:ncol(pca))
pca[1:5, 1:5]
| 0.0229048 |
-0.0082658 |
0.0348017 |
0.0030067 |
0.0038466 |
| 0.0020547 |
0.0167734 |
0.0012109 |
-0.0143895 |
-0.0120152 |
| 0.0063838 |
0.0004847 |
-0.0140442 |
-0.0074701 |
-0.0344998 |
| -0.0048649 |
0.0161120 |
0.0014843 |
-0.0216935 |
-0.0109613 |
| -0.0054193 |
0.0168193 |
0.0073643 |
0.0113918 |
-0.0272952 |
kmeans
library(fpc)
set.seed(42)
kmeans <- kmeansruns(pca, critout=TRUE, criterion='ch', krange=3)
clust <- kmeans$cluster
table(clust)
cluster_plot(pca[,1], pca[,2], clust)

tsne + kmeans
library(Rtsne)
library(ggplot2)
library(scales)
set.seed(42)
tsne <- Rtsne(pca, 2, pca=FALSE, check_duplicates=FALSE, verbose=TRUE)
km2 <- kmeansruns(tsne$Y, critout=TRUE, criterion='ch', krange=2:50)
clust <- km2$cluster
cluster_plot(tsne$Y[,1], tsne$Y[,2], clust)

tsne + pam
pam <- pamk(tsne$Y, criterion="asw", critout=TRUE, usepam=FALSE, krange=2:50)
clust <- pam$pamobject$clustering
cluster_plot(tsne$Y[,1], tsne$Y[,2], clust)

tsne + dbscan (cluster 0 is outliers)
library(dbscan)
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:fpc':
##
## dbscan
dbs <- dbscan::dbscan(tsne$Y, 2.25, minPts=47, splitRule='suggest', borderPoints=TRUE)
clust <- dbs$cluster
cluster_plot(tsne$Y[,1], tsne$Y[,2], clust)

tsne + dbscan + kmeans
set.seed(42)
dbscan_kmeans <- tune_with_kmeans(tsne$Y, dbs$cluster)
clust <- dbscan_kmeans$cluster
cluster_plot(tsne$Y[,1], tsne$Y[,2], clust)

tsne + optics (cluster 0 is outliers)
library(dbscan)
opt <- dbscan::optics(tsne$Y, eps=5, minPts=100)
opt <- optics_cut(opt, eps_cl = 3.25)
plot(opt)

hullplot(tsne$Y, opt)

clust <- opt$cluster
table(clust)
cluster_plot(tsne$Y[,1], tsne$Y[,2], clust)

tsne + optics + kmeans
set.seed(42)
dbscan_kmeans <- tune_with_kmeans(tsne$Y, opt$cluster)
clust <- dbscan_kmeans$cluster
cluster_plot(tsne$Y[,1], tsne$Y[,2], clust)

outliers